#install.packages("magrittr")
#install.packages("dplyr", dependencies=TRUE)
library(reshape2)
library(magrittr)
library(MASS)
### Load Data
data(OJ, package = "ISLR")
 # count rows
data <- na.omit(OJ)
nrow(data)
[1] 1070
head(data)
str(data)
'data.frame':   1070 obs. of  18 variables:
 $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
 $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
 $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
 $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
 $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
 $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
 $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
 $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
 $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
 $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
 $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
 $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
 $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
 $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
 $ PctDiscMM     : num  0 0.151 0 0 0 ...
 $ PctDiscCH     : num  0 0 0.0914 0 0 ...
 $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
 $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...
contrasts(data$Purchase)
   MM
CH  0
MM  1

1.a)

Create class variables

new_data <- data
new_dataY <- array(rep(0), dim=c(nrow(data),2))
for(row in 1:nrow(data)){
  if(new_data[row,1] == "CH"){
    new_dataY[row,1] = 1
  }else
    new_dataY[row, 2] = 1
}
head(new_dataY)
     [,1] [,2]
[1,]    1    0
[2,]    1    0
[3,]    1    0
[4,]    0    1
[5,]    1    0
[6,]    1    0
head(new_data)
#new_data[,19] <- new_dataY
#head(new_data)

Create class variables

# set CH as 1. 0 othervise
new_data$Purchase <- as.numeric(new_data$Purchase == "CH")
head(new_data)
# add new columns of categorization target variable
#ndata <- data.frame(data.matrix(cbind(new_data,new_dataY)))
#head(ndata)

Convert categorical to binary

new_data <- data.frame(data.matrix(new_data))

Split data randomly with 70/30 splits (train and test)

set.seed(123)
ind <- sample(c(TRUE, FALSE), nrow(new_data), replace=TRUE, prob=c(0.7, 0.3))
train_data <- new_data[ind, ]
test_data <- new_data[!ind, ]
train_data

Model

mod.ind <- lm(Purchase ~ ., data = train_data)
#mod.ind2 <- lm(lpurchase ~., data = train_data)
#summary(mod.ind)
mod.ind

Call:
lm(formula = Purchase ~ ., data = train_data)

Coefficients:
   (Intercept)  WeekofPurchase         StoreID         PriceCH         PriceMM          DiscCH  
      0.193112        0.002708        0.041636       -0.915222        0.457200       -3.368190  
        DiscMM       SpecialCH       SpecialMM         LoyalCH     SalePriceMM     SalePriceCH  
     -5.021630       -0.038520       -0.056174        0.972165              NA              NA  
     PriceDiff          Store7       PctDiscMM       PctDiscCH   ListPriceDiff           STORE  
            NA       -0.160445        9.939498        7.006355              NA              NA  

Predict

pred1 <- predict(mod.ind, newdata = test_data)
prediction from a rank-deficient fit may be misleading
head(pred1)
        2         4         5         8        11        16 
0.5486662 0.3037390 0.9368644 1.0066937 1.0278263 0.7779361 

Results

predicted <- predict(mod.ind, newdata = test_data)
prediction from a rank-deficient fit may be misleading
TAB <- table(test_data$Purchase, predicted >= 0.5)
head(TAB)
   
    FALSE TRUE
  0    87   25
  1    33  167

Missclasification rate

mklrate<- 1-sum(diag(TAB))/sum(TAB)
mklrate
[1] 0.1858974

Misclassification rate with the indicator variable is 18.6%

head(new_data)

1.b) LDA

I have tried transforming categorical and factor variables to matrices with equal distances. Furthermore, I used principal component scores as new X data to resolve collinearity/singularity. #### One train one test set

mod.lda<-lda(Purchase~., data = train_data)
variables are collinear
mod.lda
Call:
lda(Purchase ~ ., data = train_data)

Prior probabilities of groups:
        0         1 
0.4023747 0.5976253 

Group means:
  WeekofPurchase  StoreID  PriceCH  PriceMM     DiscCH     DiscMM SpecialCH SpecialMM   LoyalCH
0       252.9443 3.242623 1.873967 2.074328 0.02560656 0.16924590 0.0852459 0.2590164 0.3203285
1       255.5254 4.448124 1.866799 2.099117 0.06532009 0.09523179 0.1810155 0.1214128 0.7324193
  SalePriceMM SalePriceCH  PriceDiff   Store7  PctDiscMM  PctDiscCH ListPriceDiff    STORE
0    1.905082    1.848361 0.05672131 1.196721 0.08079513 0.01323538     0.2003607 1.865574
1    2.003885    1.801479 0.20240618 1.421634 0.04605959 0.03452180     0.2323179 1.496689

Coefficients of linear discriminants:
                        LD1
WeekofPurchase   0.01103305
StoreID          0.06541999
PriceCH         -7.02382050
PriceMM         -5.71551820
DiscCH         -12.90159252
DiscMM         -10.41260809
SpecialCH       -0.15696440
SpecialMM       -0.22889930
LoyalCH          3.96141848
SalePriceMM      6.00531439
SalePriceCH      4.86765720
PriceDiff        4.04441675
Store7           0.07590531
PctDiscMM       40.50186192
PctDiscCH       28.54977401
ListPriceDiff   -2.47119502
STORE            0.10424162

Results

predicted <- predict(mod.lda, newdata = test_data)
TAB <- table(test_data$Purchase, predicted$class)
head(TAB)
   
     0   1
  0 87  25
  1 33 167

Missclasification rate

mklrate<- 1-sum(diag(TAB))/sum(TAB)
mklrate
[1] 0.1858974
train_data

Cross Validation

mypred = function(object,newdata) UseMethod("mypred", object)
mypred.lda <- function(object, newdata){
  predict(object, newdata=newdata)$class
}
library(ipred)
CEE = control.errorest(k = 5, nboot=10)
ldacvest <- errorest(Purchase ~ ., data = data, model=lda, est.para=CEE, predict=mypred)
variables are collinearvariables are collinearvariables are collinearvariables are collinearvariables are collinear
ldacvest

Call:
errorest.data.frame(formula = Purchase ~ ., data = data, model = lda, 
    predict = mypred, est.para = CEE)

     5-fold cross-validation estimator of misclassification error 

Misclassification error:  0.1682 

binary and/or categorical columns to convert

“StoreID”, “SpecialCH”, “SpecialMM”, “Store7”, “STORE”, “rowID”

converte columns of the whole dataset

new_data$rowID <- 1:nrow(new_data)
head(new_data)
# columns to convert
columns_to_convert <- names(new_data) %in% c("StoreID", "SpecialCH", "SpecialMM", "Store7", "STORE", "rowID")
#columns_to_convert
selected_columns_data <- new_data[columns_to_convert]
# new data column converted
new_data_cc <- recast(selected_columns_data, rowID ~ variable + value, id.var = c("rowID"), fun.aggregate = function(x) (length(x) > 0) + 0L)
head(new_data_cc)
original_columns <- new_data[!columns_to_convert]
new_data_columns_converted <- cbind(original_columns, new_data_cc)
new_data_columns_converted$rowID <- NULL
new_data_columns_converted

Split converted data randomly on 70/30 splits (train and test)

set.seed(123)
ind <- sample(c(TRUE, FALSE), nrow(new_data_columns_converted), replace=TRUE, prob=c(0.7, 0.3))
# train data converted columns
train_data_cc <- new_data_columns_converted[ind, ]
# test data converted columns
test_data_cc <- new_data_columns_converted[!ind, ]
train_data_cc

LDA on converted X

mod.lda2 <-lda(Purchase~., data = train_data_cc)
variables are collinear
mod.lda2
Call:
lda(Purchase ~ ., data = train_data_cc)

Prior probabilities of groups:
        0         1 
0.4023747 0.5976253 

Group means:
  WeekofPurchase  PriceCH  PriceMM     DiscCH     DiscMM   LoyalCH SalePriceMM SalePriceCH  PriceDiff
0       252.9443 1.873967 2.074328 0.02560656 0.16924590 0.3203285    1.905082    1.848361 0.05672131
1       255.5254 1.866799 2.099117 0.06532009 0.09523179 0.7324193    2.003885    1.801479 0.20240618
   PctDiscMM  PctDiscCH ListPriceDiff StoreID_1 StoreID_2 StoreID_3  StoreID_4 StoreID_7 SpecialCH_0
0 0.08079513 0.01323538     0.2003607 0.1836066 0.2459016 0.3049180 0.06885246 0.1967213   0.9147541
1 0.04605959 0.03452180     0.2323179 0.1258278 0.1633554 0.1125828 0.17660044 0.4216336   0.8189845
  SpecialCH_1 SpecialMM_0 SpecialMM_1  Store7_1  Store7_2   STORE_0   STORE_1   STORE_2   STORE_3
0   0.0852459   0.7409836   0.2590164 0.8032787 0.1967213 0.1967213 0.1836066 0.2459016 0.3049180
1   0.1810155   0.8785872   0.1214128 0.5783664 0.4216336 0.4216336 0.1258278 0.1633554 0.1125828
     STORE_4
0 0.06885246
1 0.17660044

Coefficients of linear discriminants:
                        LD1
WeekofPurchase   0.01009241
PriceCH         -6.78358856
PriceMM         -5.82065943
DiscCH         -12.44712324
DiscMM         -10.76476287
LoyalCH          3.95673884
SalePriceMM      6.23281734
SalePriceCH      4.69241556
PriceDiff        4.29746981
PctDiscMM       42.24886370
PctDiscCH       26.89856600
ListPriceDiff   -2.85129323
StoreID_1       -0.14704949
StoreID_2       -0.01182117
StoreID_3        0.01566897
StoreID_4        0.11328117
StoreID_7        0.02553317
SpecialCH_0      0.07473489
SpecialCH_1     -0.07473489
SpecialMM_0      0.12394666
SpecialMM_1     -0.12394666
Store7_1        -0.02553317
Store7_2         0.02553317
STORE_0          0.02553317
STORE_1         -0.14704949
STORE_2         -0.01182117
STORE_3          0.01566897
STORE_4          0.11328117
plot(mod.lda2)

Predict with converted X

predicted <- predict(mod.lda2, newdata = test_data_cc)
TAB <- table(test_data_cc$Purchase, predicted$class)
head(TAB)
   
     0   1
  0 86  26
  1 32 168

Missclasification rate (Converted X)

mklrate<- 1-sum(diag(TAB))/sum(TAB)
mklrate
[1] 0.1858974

PCA-LDA

names(train_data_cc[-1])
 [1] "WeekofPurchase" "PriceCH"        "PriceMM"        "DiscCH"         "DiscMM"        
 [6] "LoyalCH"        "SalePriceMM"    "SalePriceCH"    "PriceDiff"      "PctDiscMM"     
[11] "PctDiscCH"      "ListPriceDiff"  "StoreID_1"      "StoreID_2"      "StoreID_3"     
[16] "StoreID_4"      "StoreID_7"      "SpecialCH_0"    "SpecialCH_1"    "SpecialMM_0"   
[21] "SpecialMM_1"    "Store7_1"       "Store7_2"       "STORE_0"        "STORE_1"       
[26] "STORE_2"        "STORE_3"        "STORE_4"       
X_train = train_data_cc[-1]
X_test = test_data_cc[-1]
pca_object = princomp(X_train, center=TRUE)
summary(pca_object)
Importance of components:
                           Comp.1      Comp.2      Comp.3      Comp.4     Comp.5       Comp.6
Standard deviation     15.4412681 1.031326460 0.683781810 0.582903059 0.56523276 0.4886168271
Proportion of Variance  0.9881188 0.004407939 0.001937663 0.001408108 0.00132403 0.0009894187
Cumulative Proportion   0.9881188 0.992526786 0.994464449 0.995872557 0.99719659 0.9981860061
                             Comp.7       Comp.8       Comp.9      Comp.10      Comp.11      Comp.12
Standard deviation     0.4529214441 0.3371002340 0.2644087307 0.1670275410 1.291733e-01 6.652160e-02
Proportion of Variance 0.0008501372 0.0004709354 0.0002897309 0.0001156164 6.914948e-05 1.833871e-05
Cumulative Proportion  0.9990361433 0.9995070787 0.9997968097 0.9999124260 9.999816e-01 9.999999e-01
                            Comp.13      Comp.14      Comp.15      Comp.16      Comp.17 Comp.18 Comp.19
Standard deviation     3.990375e-03 2.184297e-03 4.335038e-09 3.140639e-09 5.334464e-10       0       0
Proportion of Variance 6.598887e-08 1.977273e-08 7.788057e-20 4.087702e-20 1.179301e-21       0       0
Cumulative Proportion  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00       1       1
                       Comp.20 Comp.21 Comp.22 Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28
Standard deviation           0       0       0       0       0       0       0       0       0
Proportion of Variance       0       0       0       0       0       0       0       0       0
Cumulative Proportion        1       1       1       1       1       1       1       1       1
plot(pca_object)

biplot(pca_object)

Use scores as new X

X_train_scores <- pca_object$scores
head(X_train_scores)
       Comp.1     Comp.2     Comp.3     Comp.4       Comp.5     Comp.6     Comp.7     Comp.8      Comp.9
1  -17.492186 -0.5793143 0.10723536 0.95829098 -0.090953369  0.8259175 -0.3063367 -0.0314833  0.01056715
3   -9.490394 -0.5816134 0.02492143 0.95974270 -0.032556807  0.8837776 -0.3501329  0.1280174 -0.12811371
6  -24.478514  1.3599933 0.84170510 0.04996699 -0.302921417 -0.7428826 -0.4191345  0.7448345 -0.38997608
7  -22.474511  1.6707964 1.00088568 0.20046650 -0.387005707 -0.5713587  1.0646684  0.4864445 -0.42017807
9  -19.481040  1.4439389 0.24271302 0.01041531  0.001903890 -0.2875141 -0.2790435 -0.6266711 -0.40148980
10 -16.481215  1.4321803 0.23113787 0.01514940 -0.003798387 -0.2855126 -0.2835574 -0.6320358 -0.39632281
        Comp.10     Comp.11      Comp.12      Comp.13       Comp.14       Comp.15       Comp.16
1  -0.010539986 -0.02959065 -0.033227914  0.004570808  1.043249e-03 -8.604228e-16 -2.081668e-15
3   0.080059262 -0.15111491 -0.172008045  0.003485937 -1.640475e-03  4.135581e-15  6.133982e-15
6  -0.010017447  0.03405763  0.070250389  0.003809530  3.744344e-04 -4.232725e-16 -9.714451e-16
7  -0.237593205  0.07816845  0.083175250 -0.005541074  2.857936e-03 -6.529499e-15 -8.631984e-15
9  -0.004054360 -0.14541090 -0.016466460 -0.006747926  1.376130e-04 -1.255940e-15 -1.026956e-15
10  0.008266731 -0.13808417  0.004811616 -0.007057565  7.301851e-05 -1.255940e-15 -1.026956e-15
         Comp.17       Comp.18       Comp.19       Comp.20       Comp.21       Comp.22       Comp.23
1   1.700029e-16 -4.198031e-16 -2.463307e-16 -1.158795e-15 -7.619773e-16 -4.392320e-15  2.810252e-15
3   1.391248e-15 -3.195361e-15  1.141448e-15  6.245005e-17  2.513181e-15  3.513162e-14 -6.029899e-15
6   6.834810e-16 -1.169204e-15  3.469447e-18 -1.242062e-15 -1.039533e-15  2.546574e-15  1.353084e-15
7  -2.536166e-15  5.381112e-15 -1.883910e-15  5.065393e-16 -3.454268e-15 -5.796058e-14  8.514023e-15
9  -1.425943e-15  2.550044e-15 -6.071532e-16  1.366962e-15  3.343679e-16 -1.846440e-14  7.147061e-16
10 -1.425943e-15  2.550044e-15 -6.071532e-16  1.366962e-15  3.343679e-16 -1.846440e-14  7.147061e-16
         Comp.24       Comp.25       Comp.26       Comp.27       Comp.28
1   5.915407e-16 -5.516421e-16  1.346145e-15 -1.831868e-15  3.386180e-15
3   4.921411e-15  4.055783e-15 -2.539635e-15  1.249001e-15  1.998401e-15
6   1.812786e-15  3.712308e-16  1.304512e-15 -1.977585e-15  4.218847e-15
7  -9.067400e-15 -6.345618e-15  3.858025e-15 -1.033895e-15 -5.329071e-15
9  -4.237929e-15 -2.099015e-15 -1.665335e-16  1.602884e-15 -5.800915e-15
10 -4.237929e-15 -2.099015e-15 -1.665335e-16  1.575129e-15 -5.800915e-15
new_df_train_pc <- data.frame(X_train_scores)
new_df_test <- data.frame(X_test)
#colnames(new_df) = column_names
# train
new_df_train_pc["Purchase"] <- train_data_cc$Purchase

Project test data on principal components

test.data_pc <- predict(pca_object, newdata = new_df_test)
test.data_pc <- as.data.frame(test.data_pc)
test.data_pc["Purchase"] <- test_data_cc$Purchase
head(test.data_pc)
head(new_df_train_pc)

Predict with N principal components

mat.pc.lda <- lda(Purchase ~Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9, new_df_train_pc)
plot(mat.pc.lda)  

test.data_pc[c("Comp.1","Comp.2")]

Predict using Principal components

predicted <- predict(mat.pc.lda, newdata = test.data_pc[c("Comp.1","Comp.2","Comp.3","Comp.4","Comp.5","Comp.6","Comp.7","Comp.8","Comp.9")])
#predicted$class
TAB <- table(test.data_pc$Purchase, predicted$class)
head(TAB)
   
     0   1
  0 89  23
  1 32 168

Missclasification rate PCA-LDA on 9 first components

mklrate<- 1-sum(diag(TAB))/sum(TAB)
mklrate
[1] 0.1762821
head(new_df_train_pc)
head(test.data_pc)

1.c) QDA

Singularity problem can be solved by using principal components

mod.qda<-qda(formula = Purchase~Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10+Comp.11+Comp.12, data = new_df_train_pc)
predictqda <- predict(mod.qda, newdata = test.data_pc)

Missclasification rate with N principal components and QDA

#nrow(test_data)
#predictqda
TAB <- table(test.data_pc$Purchase, predictqda$class)
mkrqda<- 1-sum(diag(TAB)/sum(TAB))
mkrqda
[1] 0.1891026
head(train_data_cc)
head(test_data_cc)

1.d) Regularized discriminant analysis

mod.rda<-rda(Purchase~., data = train_data_cc)
#mod.rda$lambda
predictrda<-predict(mod.rda,test_data_cc)
#predictrda
TAB<-table(test_data_cc$Purchase, predictrda$class)
mkrrda<-1-sum(diag(TAB))/sum(TAB)
mkrrda
[1] 0.1891026

gamma and lambda

Lambda in the range [0, 1] trades off between QDA (lambda = 0, separate group covariance matrices) and LDA (lambda = 1, pooled covariance matrix); i.e., it selects between independent and joint group covariances.

Gamma is a regularization parameter that shrinks the covariance matrix towards the average eigenvalue.

Result with PCAs

mod.rda<-rda(Purchase ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10+Comp.11+Comp.12+Comp.13+Comp.14+Comp.15+Comp.16+Comp.17, data = new_df_train_pc, train.fraction=0.6)
mod.rda$lambda
NULL
predictrda<-predict(mod.rda,test.data_pc)
#predictrda
TAB<-table(test_data_cc$Purchase, predictrda$class)
mkrrda<-1-sum(diag(TAB))/sum(TAB)
mkrrda
[1] 0.1666667

Trying to estimate most important variables

mod<-lm(Purchase~.,data = new_df_train_pc)
summary(mod)

Call:
lm(formula = Purchase ~ ., data = new_df_train_pc)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.11740 -0.19875  0.00596  0.21954  0.99304 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7.572e-01  7.630e-02   9.923  < 2e-16 ***
Comp.1       3.679e-04  1.611e-03   0.228  0.81945    
Comp.2       2.000e-01  4.594e-01   0.435  0.66349    
Comp.3       1.890e-01  3.068e-01   0.616  0.53811    
Comp.4      -1.721e-01  2.344e-01  -0.734  0.46320    
Comp.5       7.248e-01  3.142e-01   2.306  0.02137 *  
Comp.6      -5.873e-02  2.497e-01  -0.235  0.81410    
Comp.7       2.971e-01  6.465e-01   0.460  0.64597    
Comp.8       7.405e-01  2.766e-01   2.677  0.00759 ** 
Comp.9      -6.039e-01  1.017e+00  -0.594  0.55285    
Comp.10      2.058e+00  1.221e+00   1.685  0.09239 .  
Comp.11      4.676e+00  1.705e+00   2.742  0.00626 ** 
Comp.12      1.172e+00  1.463e+01   0.080  0.93622    
Comp.13     -1.151e+03  1.599e+03  -0.720  0.47157    
Comp.14      6.463e+03  8.713e+03   0.742  0.45849    
Comp.15     -1.598e+13  4.511e+14  -0.035  0.97175    
Comp.16      4.659e+14  5.236e+14   0.890  0.37385    
Comp.17     -7.220e+14  7.018e+14  -1.029  0.30388    
Comp.18     -8.021e+14  5.156e+14  -1.556  0.12020    
Comp.19      6.811e+14  4.034e+14   1.689  0.09174 .  
Comp.20     -9.114e+14  7.418e+14  -1.229  0.21965    
Comp.21      8.990e+14  4.395e+14   2.045  0.04118 *  
Comp.22      1.915e+14  5.597e+14   0.342  0.73239    
Comp.23      4.025e+14  6.637e+14   0.606  0.54438    
Comp.24      6.202e+14  5.767e+14   1.076  0.28247    
Comp.25      4.627e+13  6.108e+14   0.076  0.93964    
Comp.26     -1.313e+15  6.324e+14  -2.076  0.03828 *  
Comp.27     -1.328e+15  6.912e+14  -1.921  0.05515 .  
Comp.28     -5.138e+14  7.692e+14  -0.668  0.50438    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3528 on 729 degrees of freedom
Multiple R-squared:  0.5023,    Adjusted R-squared:  0.4832 
F-statistic: 26.28 on 28 and 729 DF,  p-value: < 2.2e-16

2.

d <- read.csv2("bank.csv")
d

convert “yes” to 1

d$y <- as.numeric(d$y == "yes")
d

convert X columns

d$rowID <- 1:nrow(d)
#head(d)
# columns to convert
columns_to_convert <- names(d) %in% c("job", "marital", "education", "default", "housing", "loan","contact","poutcome","rowID")
#columns_to_convert
selected_columns_data <- d[columns_to_convert]
# new data column converted
new_d_cc <- recast(selected_columns_data, rowID ~ variable + value, id.var = c("rowID"), fun.aggregate = function(x) (length(x) > 0) + 0L)
attributes are not identical across measure variables; they will be dropped
new_d_cc
original_columns <- d[!columns_to_convert]
new_d_cc <- cbind(original_columns, new_d_cc)
new_d_cc$rowID <- NULL
new_d_cc
new_d_cc <- data.frame(data.matrix(new_d_cc))
new_d_cc

2.a)

sample_ind <- sample(nrow(new_d_cc), 3000)
d_train <- d[sample_ind,]
d_test <- d[-sample_ind,]
plot(d_train$y)

mod.lda3 <-lda(y~., data = d[sample_ind,])
mod.lda3
Call:
lda(y ~ ., data = d[sample_ind, ])

Prior probabilities of groups:
        0         1 
0.8896667 0.1103333 

Group means:
       age jobblue-collar jobentrepreneur jobhousemaid jobmanagement jobretired jobself-employed
0 41.07981      0.2180592      0.03709254   0.02547771     0.2094417 0.04196328       0.04233795
1 42.39577      0.1389728      0.03625378   0.01812689     0.2326284 0.09667674       0.03625378
  jobservices jobstudent jobtechnician jobunemployed  jobunknown maritalmarried maritalsingle
0  0.09291870 0.01611090     0.1783439    0.02922443 0.007118771      0.6309479     0.2529037
1  0.06344411 0.03927492     0.1752266    0.03021148 0.012084592      0.5075529     0.3383686
  educationsecondary educationtertiary educationunknown defaultyes  balance housingyes   loanyes
0          0.5133009         0.2922443       0.04083927 0.01760959 1423.451  0.5811165 0.1659798
1          0.4743202         0.3625378       0.03323263 0.01812689 1537.154  0.4561934 0.0815710
  contacttelephone contactunknown      day  monthaug    monthdec   monthfeb   monthjan  monthjul
0       0.06444361      0.3195954 16.04646 0.1427501 0.003372049 0.04533533 0.03259648 0.1637317
1       0.07250755      0.1208459 15.60423 0.1510574 0.021148036 0.06948640 0.03625378 0.1238671
    monthjun    monthmar  monthmay   monthnov   monthoct    monthsep duration campaign    pdays
0 0.11352567 0.007493443 0.3252154 0.08542525 0.01198951 0.007118771 226.0367 2.878606 36.67141
1 0.09969789 0.045317221 0.1933535 0.07250755 0.05135952 0.033232628 572.7341 2.193353 71.57402
   previous poutcomeother poutcomesuccess poutcomeunknown    rowID
0 0.4825777    0.04121394      0.01124016       0.8385163 2256.931
1 1.1873112    0.08761329      0.16012085       0.6404834 2310.508

Coefficients of linear discriminants:
                             LD1
age                 5.526507e-03
jobblue-collar     -2.706623e-01
jobentrepreneur    -1.101781e-01
jobhousemaid       -4.140922e-01
jobmanagement      -1.513013e-01
jobretired          1.724787e-01
jobself-employed   -1.321104e-01
jobservices        -1.716546e-01
jobstudent          2.436472e-01
jobtechnician      -1.323998e-01
jobunemployed      -1.829038e-01
jobunknown          2.229728e-01
maritalmarried     -2.186959e-01
maritalsingle      -3.863537e-02
educationsecondary  1.465623e-02
educationtertiary   1.404901e-01
educationunknown   -1.826217e-01
defaultyes          3.462973e-01
balance            -3.249330e-06
housingyes         -1.046023e-02
loanyes            -2.021002e-01
contacttelephone    1.099550e-02
contactunknown     -5.924556e-01
day                 9.017458e-03
monthaug           -9.305973e-02
monthdec            2.389995e-01
monthfeb            1.174320e-01
monthjan           -5.780757e-01
monthjul           -3.597957e-01
monthjun            3.453750e-01
monthmar            1.672657e+00
monthmay           -1.382086e-01
monthnov           -4.616188e-01
monthoct            7.447575e-01
monthsep            7.407785e-01
duration            3.370065e-03
campaign           -1.714388e-02
pdays               4.166779e-04
previous            3.380204e-02
poutcomeother       5.260135e-01
poutcomesuccess     3.088828e+00
poutcomeunknown     2.067692e-01
rowID               2.232217e-05
#head(TAB3)
predicted3 <- predict(mod.lda3, newdata = d_test)
TAB3 <- table(d_test$y, predicted3$class)
TAB3
   
       0    1
  0 1287   44
  1  114   76

Result

mklrate<- 1-sum(diag(TAB3))/sum(TAB3)
mklrate
[1] 0.103879

2.b)

I used a ‘balanced’ sampling strategy with 13% of the ‘no’ cases and 80% of the ‘yes’ cases. Thus I made a new balanced train sample of ~1000 observations and tested on the held-out test sample, which did not improve the recall.

library(BalancedSampling)
# Select sample
set.seed(123);
N = 4521; # population size
n = 3000; # sample size
p = rep(n/N,N); # inclusion probabilities
X = cbind(p,runif(N),runif(N)); # matrix of auxiliary variables
s = cube(p,X); # select sample
head(s)
[1] 1020 2956 3194  866  799  352
# add row id
new_d_cc$id <- 1:nrow(new_d_cc)
group_no <- new_d_cc[ which(new_d_cc$y==0),]
group_yes <- new_d_cc[ which(new_d_cc$y==1),]
group_no
group_yes
#install.packages("splitstackshape", dependencies = TRUE)
library(splitstackshape)
set.seed(1)
sample.balanced.no <- stratified(group_no, c("y"), .13)
sample.balanced.no
sample.balanced.yes <- stratified(group_yes, c("y"), .8)
sample.balanced.yes
sample.combined <- rbind(sample.balanced.no, sample.balanced.yes)
sample.combined
commonID<-intersect(sample.combined$id, new_d_cc$id)
test_data_dd <- new_d_cc[-commonID,]
test_data_dd
plot(sample.combined$y)

sample.combined
test_data_dd$id<-NULL
sample.combined$id<-NULL
nrow(test_data_dd)
[1] 3584
mod.lda4 <-lda(y~., data = sample.combined)
variables are collinear
mod.lda4
Call:
lda(y ~ ., data = sample.combined)

Prior probabilities of groups:
        0         1 
0.5549626 0.4450374 

Group means:
       age  balance      day    month duration campaign    pdays  previous job_admin. job_blue.collar
0 40.87500 1475.883 15.83846 6.748077 236.6788 2.938462 34.73269 0.4173077  0.1115385       0.2211538
1 42.21343 1605.247 15.88249 6.235012 551.8321 2.225420 69.70743 1.1294964  0.1175060       0.1342926
  job_entrepreneur job_housemaid job_management job_retired job_self.employed job_services job_student
0       0.03653846    0.02307692      0.1884615  0.04615385        0.03846154   0.10961538  0.01538462
1       0.02637890    0.02158273      0.2565947  0.10311751        0.02877698   0.07434053  0.04076739
  job_technician job_unemployed job_unknown marital_divorced marital_married marital_single
0      0.1750000     0.03076923 0.003846154        0.1288462       0.6134615      0.2576923
1      0.1606715     0.02158273 0.014388489        0.1462830       0.5275779      0.3261391
  education_primary education_secondary education_tertiary education_unknown default_no default_yes
0         0.1634615           0.5173077          0.2653846        0.05384615  0.9769231  0.02307692
1         0.1175060           0.4868106          0.3573141        0.03836930  0.9832134  0.01678657
  housing_no housing_yes   loan_no   loan_yes contact_cellular contact_telephone contact_unknown
0  0.4500000   0.5500000 0.8500000 0.15000000        0.6173077        0.04230769       0.3403846
1  0.5779376   0.4220624 0.9232614 0.07673861        0.8033573        0.08393285       0.1127098
  poutcome_failure poutcome_other poutcome_success poutcome_unknown
0       0.09807692     0.04038462      0.005769231        0.8557692
1       0.13429257     0.07434053      0.158273381        0.6330935

Coefficients of linear discriminants:
                              LD1
age                 -3.600979e-03
balance              1.638638e-05
day                  8.219898e-04
month               -5.822340e-03
duration             2.626851e-03
campaign            -5.699570e-02
pdays               -5.055202e-04
previous             4.032731e-02
job_admin.           9.864568e-02
job_blue.collar     -1.750189e-01
job_entrepreneur    -2.127936e-01
job_housemaid       -2.202679e-01
job_management       1.643140e-01
job_retired          6.500307e-01
job_self.employed   -8.875918e-02
job_services        -4.159753e-01
job_student          6.414710e-01
job_technician      -5.230941e-02
job_unemployed      -8.520246e-01
job_unknown          9.209075e-01
marital_divorced     1.198250e-01
marital_married     -9.593275e-02
marital_single       4.481963e-02
education_primary   -1.215675e-01
education_secondary -1.578210e-03
education_tertiary   1.497721e-01
education_unknown   -3.642761e-01
default_no           3.426875e-02
default_yes         -3.426875e-02
housing_no           6.846746e-02
housing_yes         -6.846746e-02
loan_no              2.680189e-01
loan_yes            -2.680189e-01
contact_cellular     2.447616e-01
contact_telephone    4.014668e-01
contact_unknown     -4.261276e-01
poutcome_failure    -1.810792e-01
poutcome_other       8.899898e-02
poutcome_success     1.592406e+00
poutcome_unknown    -4.995688e-01

Confusion matrix

predicted4 <- predict(mod.lda4, newdata = test_data_dd)
TAB4 <- table(test_data_dd$y, predicted4$class)
TAB4
   
       0    1
  0 3063  417
  1   40   64

Result with no balanced sampling

       0    1
  0 1298   44
  1   98   81

Result

mklrate<- 1-sum(diag(TAB4))/sum(TAB4)
mklrate
[1] 0.1275112
---
title: "Advanced Methods for Regression and Classification - Exercise 4 Experiment - Dzenan Hamzic, TU Wien"
output: html_notebook
---

```{r}
#install.packages("magrittr")
#install.packages("dplyr", dependencies=TRUE)
library(reshape2)
library(magrittr)
library(MASS)
library(klaR)
```



```{r}
### Load Data
data(OJ, package = "ISLR")
 # count rows
data <- na.omit(OJ)
nrow(data)
```


```{r}
head(data)
```

```{r}
str(data)
```



```{r}
contrasts(data$Purchase)
```


## 1.a) 

#### Create class variables
```{r}
new_data <- data
# One-hot indicator matrix for the two-level response Purchase:
# column 1 = 1 for "CH" purchases, column 2 = 1 for "MM" purchases.
# Vectorized replacement for the original row-by-row loop (same result,
# no 1:nrow() footgun, no scalar assignments).
new_dataY <- cbind(
  as.numeric(new_data$Purchase == "CH"),
  as.numeric(new_data$Purchase != "CH")
)

head(new_dataY)
```

```{r}
head(new_data)
#new_data[,19] <- new_dataY
#head(new_data)
```
#### Create class variables
```{r}
# Encode the response as an indicator: Purchase = 1 for "CH", 0 otherwise ("MM")
new_data$Purchase <- as.numeric(new_data$Purchase == "CH")
head(new_data)
```
```{r}
# add new columns of categorization target variable
#ndata <- data.frame(data.matrix(cbind(new_data,new_dataY)))
#head(ndata)
```

#### Convert categorical to binary
```{r}
new_data <- data.frame(data.matrix(new_data))
```



#### Split data randomly with 70/30 splits (train and test)
```{r}
set.seed(123)  # fix the RNG state so the split is reproducible
# Bernoulli(0.7) mask over the rows: TRUE -> training set, FALSE -> test set
in_train <- sample(c(TRUE, FALSE), nrow(new_data), replace = TRUE, prob = c(0.7, 0.3))
train_data <- new_data[in_train, ]
test_data <- new_data[!in_train, ]
train_data
```

#### Model
```{r}
# Linear probability model: ordinary least squares on the 0/1 indicator response.
# Several price columns (SalePrice*, PriceDiff, ListPriceDiff, STORE) are exact
# linear combinations of other predictors, so the fit is rank-deficient and
# those coefficients are reported as NA.
mod.ind <- lm(Purchase ~ ., data = train_data)
#mod.ind2 <- lm(lpurchase ~., data = train_data)
#summary(mod.ind)
mod.ind
```



#### Predict
```{r}
pred1 <- predict(mod.ind, newdata = test_data)
head(pred1)
```
#### Results
```{r}
predicted <- predict(mod.ind, newdata = test_data)
TAB <- table(test_data$Purchase, predicted >= 0.5)
head(TAB)
```

#### Misclassification rate
```{r}
# Misclassification rate: share of cases off the diagonal of the confusion table
accuracy <- sum(diag(TAB)) / sum(TAB)
mklrate <- 1 - accuracy
mklrate
```
Misclassification rate with the indicator variable is 18.6%


```{r}
head(new_data)
```

## 1.b) LDA
I have tried transforming categorical and factor variables to matrices with equal distances.
Furthermore, I used principal component scores as new X data to resolve collinearity/singularity.
#### One train one test set
```{r}
mod.lda<-lda(Purchase~., data = train_data)
mod.lda
```

#### Results
```{r}
predicted <- predict(mod.lda, newdata = test_data)
TAB <- table(test_data$Purchase, predicted$class)
head(TAB)
```

#### Misclassification rate
```{r}
mklrate<- 1-sum(diag(TAB))/sum(TAB)
mklrate
```
```{r}
train_data
```

#### Cross Validation
```{r}
# S3 generic wrapper: errorest() expects a predict function that returns hard
# class labels, but predict.lda() returns a list (class, posterior, x) — the
# lda method below extracts just the $class component.
mypred <- function(object, newdata) {
  UseMethod("mypred", object)
}
mypred.lda <- function(object, newdata) {
  predict(object, newdata = newdata)$class
}
library(ipred)
# 5-fold cross-validation estimate of the LDA misclassification error
CEE <- control.errorest(k = 5, nboot = 10)
ldacvest <- errorest(Purchase ~ ., data = data, model = lda, est.para = CEE, predict = mypred)
ldacvest
```


#### binary and/or categorical columns to convert 
#### "StoreID", "SpecialCH", "SpecialMM", "Store7", "STORE", "rowID"
#### convert columns of the whole dataset
```{r}
# Per-row key: recast() needs an id column to return one output row per observation
new_data$rowID <- seq_len(nrow(new_data))
head(new_data)

# Categorical / indicator columns to expand into 0/1 dummy columns
columns_to_convert <- names(new_data) %in% c("StoreID", "SpecialCH", "SpecialMM", "Store7", "STORE", "rowID")
#columns_to_convert
selected_columns_data <- new_data[columns_to_convert]

# One dummy column per (variable, value) pair; a cell is 1 iff that value
# occurs for the row (length(x) > 0), coerced to integer via + 0L
new_data_cc <- recast(selected_columns_data, rowID ~ variable + value, id.var = c("rowID"), fun.aggregate = function(x) (length(x) > 0) + 0L)

head(new_data_cc)
```

```{r}
# Re-attach the untouched columns to the new indicator columns and drop
# the helper rowID key.
original_columns <- new_data[!columns_to_convert]
new_data_columns_converted <- cbind(original_columns, new_data_cc)
new_data_columns_converted$rowID <- NULL
new_data_columns_converted
```

#### Split converted data randomly on 70/30 splits (train and test)
```{r}
# Same seed as the earlier split, so the 70/30 partition matches it
# row-for-row.
set.seed(123)
ind <- sample(c(TRUE, FALSE), nrow(new_data_columns_converted), replace=TRUE, prob=c(0.7, 0.3))
# train data, converted columns
train_data_cc <- new_data_columns_converted[ind, ]
# test data, converted columns
test_data_cc <- new_data_columns_converted[!ind, ]
train_data_cc
```

#### LDA on converted X
```{r}
# LDA on the one-hot encoded predictors.
mod.lda2 <-lda(Purchase~., data = train_data_cc)
mod.lda2
```
```{r}
# Histograms of the discriminant scores, one panel per class.
plot(mod.lda2)
```


#### Predict with converted X
```{r}
# Confusion matrix on the held-out one-hot encoded rows.
predicted <- predict(mod.lda2, newdata = test_data_cc)
TAB <- table(test_data_cc$Purchase, predicted$class)
head(TAB)
```

#### Misclassification rate (Converted X)
```{r}
# Misclassification rate: 1 - accuracy.
mklrate<- 1-sum(diag(TAB))/sum(TAB)
mklrate
```

### PCA-LDA
```{r}
# Predictor names that will enter the PCA (first column, Purchase,
# is dropped).
names(train_data_cc[-1])
```


```{r}
# PCA on the training predictors (the response column is excluded).
X_train = train_data_cc[-1]
X_test = test_data_cc[-1]
# princomp() always centers the data and has no 'center' argument (that
# belongs to prcomp()); the previous center=TRUE was silently absorbed by
# '...' and ignored, so it is removed here.
pca_object = princomp(X_train)
summary(pca_object)
```
```{r}
# Scree plot: variance captured by each principal component.
plot(pca_object)
```



```{r}
# Biplot of the first two components together with variable loadings.
biplot(pca_object)
```


#### Use scores as new X
```{r}
# Principal-component scores of the training rows (rotated coordinates).
X_train_scores <- pca_object$scores
head(X_train_scores)
```
```{r}
# Wrap the score matrix in a data frame and re-attach the response so it
# can serve as LDA training data.
new_df_train_pc <- data.frame(X_train_scores)
new_df_test <- data.frame(X_test)
#colnames(new_df) = column_names
# train
new_df_train_pc["Purchase"] <- train_data_cc$Purchase
```


#### Project test data on principal components
```{r}
# predict.princomp projects new observations onto the training-set
# component axes (using the training means and loadings).
test.data_pc <- predict(pca_object, newdata = new_df_test)
test.data_pc <- as.data.frame(test.data_pc)
test.data_pc["Purchase"] <- test_data_cc$Purchase
head(test.data_pc)
```

```{r}
# Check the score data frame.
head(new_df_train_pc)
```

#### Predict with N principal components
```{r}
# Fit LDA on the first nine principal components. Building the formula
# programmatically avoids spelling out every Comp.k term by hand.
pc9_formula <- reformulate(paste0("Comp.", 1:9), response = "Purchase")
mat.pc.lda <- lda(pc9_formula, new_df_train_pc)
plot(mat.pc.lda)  
```
```{r}
# Peek at the projected test scores on the first two components.
test.data_pc[c("Comp.1","Comp.2")]
```

#### Predict using Principal components
```{r}
# Classify the projected test rows using the nine retained components;
# generate the column names rather than listing them literally.
pc_cols <- paste0("Comp.", 1:9)
predicted <- predict(mat.pc.lda, newdata = test.data_pc[pc_cols])
TAB <- table(test.data_pc$Purchase, predicted$class)
head(TAB)
```

#### Misclassification rate of PCA-LDA on the first 9 components
```{r}
# Misclassification rate: 1 - accuracy.
mklrate<- 1-sum(diag(TAB))/sum(TAB)
mklrate
```


```{r}
# Sanity check: train and projected test frames share the same layout.
head(new_df_train_pc)
head(test.data_pc)
```



## 1.c) QDA
Singularity problem can be solved by using principal components
```{r}
# QDA on the first twelve principal components; the orthogonal scores
# avoid the singular covariance matrices the raw dummies would cause.
qda_formula <- reformulate(paste0("Comp.", 1:12), response = "Purchase")
mod.qda <- qda(formula = qda_formula, data = new_df_train_pc)
predictqda <- predict(mod.qda, newdata = test.data_pc)
```

#### Misclassification rate with N principal components and QDA
```{r}
#nrow(test_data)
#predictqda
TAB <- table(test.data_pc$Purchase, predictqda$class)
# Misclassification rate, written with the same grouping as every other
# chunk: 1 - accuracy. The previous form sum(diag(TAB)/sum(TAB)) gave the
# same number but obscured the intent.
mkrqda<- 1-sum(diag(TAB))/sum(TAB)
mkrqda
```

```{r}
# Quick look at the one-hot encoded train/test frames used for RDA.
head(train_data_cc)
head(test_data_cc)
```


## 1.d) Regularized discriminant analysis
```{r}
# Regularized discriminant analysis on the encoded predictors.
# NOTE(review): rda() is presumably klaR::rda, but no library(klaR) call
# is visible in this file -- confirm it is loaded earlier.
mod.rda<-rda(Purchase~., data = train_data_cc)
#mod.rda$lambda
predictrda<-predict(mod.rda,test_data_cc)
#predictrda
TAB<-table(test_data_cc$Purchase, predictrda$class)
mkrrda<-1-sum(diag(TAB))/sum(TAB)
mkrrda
```
#### gamma and lambda
Lambda in [0, 1] selects between the two covariance estimates: lambda = 0 uses the class-specific (individual) covariances as in QDA, while lambda = 1 uses the pooled covariance as in LDA, so intermediate values are a compromise between the two models.
Gamma is a regularization parameter that shrinks the covariance estimate towards a multiple of the identity scaled by the average eigenvalue.

#### Result with PCAs
```{r}
# RDA on the first 17 principal components.
mod.rda<-rda(Purchase ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10+Comp.11+Comp.12+Comp.13+Comp.14+Comp.15+Comp.16+Comp.17, data = new_df_train_pc, train.fraction=0.6)

mod.rda$lambda
predictrda<-predict(mod.rda,test.data_pc)
#predictrda
# Pair the predictions with labels from the SAME frame they were made on
# (test.data_pc). The original used test_data_cc$Purchase, which only
# worked because the two frames happen to share row order.
TAB<-table(test.data_pc$Purchase, predictrda$class)
mkrrda<-1-sum(diag(TAB))/sum(TAB)
mkrrda
```
#### Trying to estimate most important variables
```{r}
# OLS on the component scores as a rough gauge of which components carry
# signal (via the coefficient t-statistics).
mod<-lm(Purchase~.,data = new_df_train_pc)
summary(mod)
```

## 2. 
```{r}
# Semicolon-separated CSV (European convention) of the bank data.
d <- read.csv2("bank.csv")
d
```

#### convert "yes" to 1
```{r}
# Binary response: 1 for "yes", 0 otherwise.
d$y <- as.numeric(d$y == "yes")
d
```

#### convert X columns
```{r}
# One-hot encode the categorical predictors, keyed by a temporary rowID
# (same recast() technique as used for the OJ data above).
d$rowID <- 1:nrow(d)
#head(d)

# columns to convert
columns_to_convert <- names(d) %in% c("job", "marital", "education", "default", "housing", "loan","contact","poutcome","rowID")
#columns_to_convert
selected_columns_data <- d[columns_to_convert]

# new data, columns converted to 0/1 indicators
new_d_cc <- recast(selected_columns_data, rowID ~ variable + value, id.var = c("rowID"), fun.aggregate = function(x) (length(x) > 0) + 0L)

new_d_cc
```

```{r}
# Re-attach the untouched columns and drop the helper rowID key.
original_columns <- d[!columns_to_convert]
new_d_cc <- cbind(original_columns, new_d_cc)
new_d_cc$rowID <- NULL
new_d_cc
```

```{r}
# Coerce any remaining factor columns to their numeric codes.
new_d_cc <- data.frame(data.matrix(new_d_cc))
new_d_cc
```

## 2.a)
```{r}
# Reproducible 3000-row training sample. Draw the indices against d
# itself -- the frame actually being indexed. The original drew them
# against nrow(new_d_cc), which only worked because both frames happen
# to have the same number of rows; it also set no seed, so the split was
# not reproducible.
set.seed(123)
sample_ind <- sample(nrow(d), 3000)
d_train <- d[sample_ind,]
d_test <- d[-sample_ind,]
```

```{r}
# Scatter of the 0/1 response -- visualizes the class imbalance.
plot(d_train$y)
```


```{r}
# Fit LDA on the training sample. Use the d_train object built above
# instead of re-indexing d[sample_ind,] -- identical rows, clearer intent.
mod.lda3 <-lda(y~., data = d_train)
mod.lda3
#head(TAB3)

```
```{r}
# Confusion matrix on the held-out bank rows.
predicted3 <- predict(mod.lda3, newdata = d_test)
TAB3 <- table(d_test$y, predicted3$class)
TAB3
```


#### Result
```{r}
# Misclassification rate: 1 - accuracy.
mklrate<- 1-sum(diag(TAB3))/sum(TAB3)
mklrate
```

## 2.b)
I used a 'balanced' sampling strategy, keeping 13% of the "no" cases and 80% of the "yes" cases. This produced a new balanced training sample of roughly 1000 observations; testing on the held-out data did not improve the recall.

```{r}
library(BalancedSampling)
# Select sample
set.seed(123);
N = 4521; # population size
n = 3000; # sample size
p = rep(n/N,N); # inclusion probabilities
X = cbind(p,runif(N),runif(N)); # matrix of auxiliary variables
s = cube(p,X); # select sample via the cube method, balanced on X
# NOTE(review): s is never used below -- the balanced split actually
# comes from splitstackshape::stratified further down.
head(s)
```
```{r}
# add row id
new_d_cc$id <- 1:nrow(new_d_cc)

group_no <- new_d_cc[ which(new_d_cc$y==0),]
group_yes <- new_d_cc[ which(new_d_cc$y==1),]
group_no
group_yes
```



```{r}
#install.packages("splitstackshape", dependencies = TRUE)
library(splitstackshape)
set.seed(1)
sample.balanced.no <- stratified(group_no, c("y"), .13)
sample.balanced.no
sample.balanced.yes <- stratified(group_yes, c("y"), .8)
sample.balanced.yes

```

```{r}
# balanced training set: downsampled "no" rows plus most "yes" rows
sample.combined <- rbind(sample.balanced.no, sample.balanced.yes)
sample.combined
```


```{r}
# Held-out rows: everything not drawn into the balanced sample. Match on
# the id column rather than using the ids as row positions -- the original
# new_d_cc[-commonID,] only worked because id was assigned as 1:nrow.
commonID<-intersect(sample.combined$id, new_d_cc$id)
test_data_dd <- new_d_cc[!(new_d_cc$id %in% commonID),]
test_data_dd
```



```{r}
# Response scatter of the rebalanced training set.
plot(sample.combined$y)
```
```{r}
# Inspect the combined balanced sample.
sample.combined
```
```{r}
# drop the helper id column before modelling
test_data_dd$id<-NULL
sample.combined$id<-NULL
```

```{r}
# Size of the held-out set.
nrow(test_data_dd)
```


```{r}
# LDA trained on the rebalanced sample.
mod.lda4 <-lda(y~., data = sample.combined)
mod.lda4
```
#### Confusion matrix
```{r}
# Confusion matrix for the balanced-sample model on the held-out rows.
predicted4 <- predict(mod.lda4, newdata = test_data_dd)
TAB4 <- table(test_data_dd$y, predicted4$class)
TAB4
```
Result without balanced sampling, for comparison:

     0    1
  0 1298   44
  1   98   81


#### Result
```{r}
# Misclassification rate: 1 - accuracy.
mklrate<- 1-sum(diag(TAB4))/sum(TAB4)
mklrate
```






